This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(fpp2)
## Loading required package: forecast
## Loading required package: fma
## Loading required package: expsmooth
library(xray)
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.5.2
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
# Load the raw donations dataset from a path relative to the
# working directory.
# NOTE(review): the summary below shows factor columns
# (e.g. Branch_Code, Donation_type), so this was knitted with the
# pre-R-4.0 default stringsAsFactors = TRUE — confirm when re-running.
dataset <- read.csv('TT2/data2.csv')
summary(dataset)
## Financial_Year Branch_Code Sequence_1 Sequence_2
## Min. :14.00 BRC-01:193503 Min. : 345 Min. : 1
## 1st Qu.:15.00 BRC-02: 25126 1st Qu.: 11289 1st Qu.: 7572
## Median :16.00 BRC-03: 1251 Median : 22564 Median :16388
## Mean :15.87 BRC-05: 239 Mean : 34540 Mean :17716
## 3rd Qu.:17.00 BRC-06: 1763 3rd Qu.: 35659 3rd Qu.:27430
## Max. :18.00 BRC-07: 3616 Max. :122099 Max. :89061
## NA's :2
## Donation_type Donor_Age Donation_Date Gender
## R :119386 Min. : -60.04 26-SEP-13: 281 : 14
## T : 64420 1st Qu.: 23.50 22-AUG-13: 269 F: 129
## M : 19709 Median : 28.00 19-SEP-13: 266 M:225355
## N : 15304 Mean : 29.40 28-AUG-13: 253
## P : 6598 3rd Qu.: 33.00 23-AUG-13: 251
## A : 67 Max. :2002.34 06-OCT-13: 249
## (Other): 14 NA's :16455 (Other) :223929
## Blood_Group_Code Donor_Weight Donor_Temperature Donor_Pulse
## Min. : 0.0 Min. :-80.00 Min. : 0.30 Min. : 0.00
## 1st Qu.: 3.0 1st Qu.: 65.00 1st Qu.: 37.00 1st Qu.: 72.00
## Median : 3.0 Median : 72.00 Median : 37.00 Median : 72.00
## Mean : 3.6 Mean : 73.73 Mean : 38.02 Mean : 74.57
## 3rd Qu.: 5.0 3rd Qu.: 80.00 3rd Qu.: 37.00 3rd Qu.: 72.00
## Max. :17.0 Max. :980.00 Max. :999.00 Max. :872.00
## NA's :539 NA's :32827 NA's :32830 NA's :32829
## Donor_Hemoglobin Donor_Blood_Pressure Test_1 C1
## Min. : 0.13 120/80 :179530 Min. : 0.0000 : 2498
## 1st Qu.: 13.00 : 32831 1st Qu.: 0.1230 I: 146
## Median : 13.00 120/70 : 2092 Median : 0.1830 N:222846
## Mean : 13.34 80/120 : 805 Mean : 0.3376 R: 8
## 3rd Qu.: 13.00 120/90 : 780 3rd Qu.: 0.2940
## Max. :714.00 120//80: 773 Max. :1183.0220
## NA's :32830 (Other): 8687 NA's :2563
## Test_2 C2 Test_3 Test_4
## Min. : 0.001 : 2614 : 262 : 660
## 1st Qu.: 0.183 I: 3761 N:221563 N:224836
## Median : 0.222 N:219068 P: 293 P: 1
## Mean : 47.756 P: 2 R: 3380 R: 1
## 3rd Qu.: 0.264 R: 53
## Max. :8071.204
## NA's :2646
# Keep only the columns needed for the time-series analysis:
# branch code, donation type and donation date.
time_data <- dataset[, c("Branch_Code", "Donation_type", "Donation_Date")]
head(time_data)
## Branch_Code Donation_type Donation_Date
## 1 BRC-01 R 01-JUL-13
## 2 BRC-01 R 01-JUL-13
## 3 BRC-01 R 01-JUL-13
## 4 BRC-01 R 01-JUL-13
## 5 BRC-01 T 01-JUL-13
## 6 BRC-01 T 01-JUL-13
summary(time_data)
## Branch_Code Donation_type Donation_Date
## BRC-01:193503 R :119386 26-SEP-13: 281
## BRC-02: 25126 T : 64420 22-AUG-13: 269
## BRC-03: 1251 M : 19709 19-SEP-13: 266
## BRC-05: 239 N : 15304 28-AUG-13: 253
## BRC-06: 1763 P : 6598 23-AUG-13: 251
## BRC-07: 3616 A : 67 06-OCT-13: 249
## (Other): 14 (Other) :223929
# Parse Donation_Date strings such as "26-SEP-13" into Date objects.
# NOTE(review): "%y" pivots two-digit years (00-68 -> 20xx,
# 69-99 -> 19xx), which is consistent with the 1974-2018 range
# printed below; "%b" month parsing is locale-dependent — confirm
# an English locale when re-running.
time_data$Donation_Date <- as.Date(time_data$Donation_Date, format = "%d-%b-%y")
head(time_data)
## Branch_Code Donation_type Donation_Date
## 1 BRC-01 R 2013-07-01
## 2 BRC-01 R 2013-07-01
## 3 BRC-01 R 2013-07-01
## 4 BRC-01 R 2013-07-01
## 5 BRC-01 T 2013-07-01
## 6 BRC-01 T 2013-07-01
# Drop rows with missing values (only a handful of NAs in the
# selected columns).
time_data <- na.omit(time_data)
# Check the date range covered by the data.
min(time_data$Donation_Date)
## [1] "1974-04-04"
max(time_data$Donation_Date)
## [1] "2018-07-06"
# Restrict to dates from the start of the financial year
# (1 July 2013). Compare against an explicit Date value instead of
# relying on implicit character-to-Date coercion.
time_data <- time_data %>%
  filter(Donation_Date >= as.Date("2013-07-01"))
# Subset the donation records for one or more branches.
# Branch BRC-01 is analyzed below because it has by far the most
# donations (see the summary above).
#
# branch: character vector of Branch_Code values to keep.
# data:   data frame to subset (defaults to the global time_data,
#         preserving the original call sites).
branch_data_fun <- function(branch, data = time_data) {
  # Return the filtered frame directly; the original assigned it to
  # a local (brac_data) and relied on the assignment's invisible value.
  data %>%
    filter(Branch_Code %in% branch)
}
# Subset the data to the first branch.
branch_data <- branch_data_fun("BRC-01")
head(branch_data)
## Branch_Code Donation_type Donation_Date
## 1 BRC-01 R 2013-07-01
## 2 BRC-01 R 2013-07-01
## 3 BRC-01 R 2013-07-01
## 4 BRC-01 R 2013-07-01
## 5 BRC-01 T 2013-07-01
## 6 BRC-01 T 2013-07-01
# Daily donor counts for the selected branch: one row per donation
# date, with x = number of donors who came on that date.
agg_brac_date <- aggregate(
  x = branch_data[, 3],
  by = list(Date = branch_data$Donation_Date),
  FUN = length
)
head(agg_brac_date)
## Date x
## 1 2013-07-01 98
## 2 2013-07-02 135
## 3 2013-07-03 121
## 4 2013-07-04 113
## 5 2013-07-05 122
## 6 2013-07-06 102
# Some calendar dates in the observed range had no donations at all
# and are therefore absent from agg_brac_date; give them explicit
# zero-count rows so the daily series has no gaps.
calendar <- seq(min(agg_brac_date$Date), max(agg_brac_date$Date), 1)
missing_days <- calendar[!(calendar %in% agg_brac_date$Date)]
zero_rows <- data.frame()
if (length(missing_days) > 0) {
  zero_rows <- data.frame(Date = missing_days, x = 0)
}
# Combine the observed and zero-count rows, then restore
# chronological order by Date.
full_data_branch <- rbind(agg_brac_date, zero_rows)
full_data_branch <- full_data_branch[order(full_data_branch$Date), ]
head(full_data_branch)
## Date x
## 1 2013-07-01 98
## 2 2013-07-02 135
## 3 2013-07-03 121
## 4 2013-07-04 113
## 5 2013-07-05 122
## 6 2013-07-06 102
# Identify outliers with the boxplot rule (values beyond
# 1.5 * IQR from the quartiles, as used by boxplot.stats).
outliers <- boxplot.stats(full_data_branch$x)$out
outliers
## [1] 184 211 228 203 189 195 189 190 207 235 202 183 233 184 184 203 183
## [18] 191 197 183 221 195 223 192 195 191 184 26 0 2 1 1 1 1
# Interactive boxplot (highcharter) to visualize the outliers.
hcboxplot(x = full_data_branch$x) %>%
hc_chart(type = "column")
# Smooth the outliers: replace each outlying daily count with the
# average of the series mean and median.
meann <- (mean(full_data_branch$x) + median(full_data_branch$x)) / 2
meann
## [1] 104.3114
# Replace the outlying values in place.
# Fix: the previous approach split the frame into non-outlier (xx)
# and outlier (yy) rows and rbind-ed them back together, which
# appended the replaced rows at the END of the frame and destroyed
# the chronological order that ts() below depends on.
full_data_branch$x[full_data_branch$x %in% outliers] <- meann
# Data after smoothing
head(full_data_branch,10)
## Date x
## 1 2013-07-01 98
## 2 2013-07-02 135
## 3 2013-07-03 121
## 4 2013-07-04 113
## 5 2013-07-05 122
## 6 2013-07-06 102
## 7 2013-07-07 99
## 8 2013-07-08 143
## 9 2013-07-09 124
## 10 2013-07-10 156
# Build a daily time series (frequency = 365).
# NOTE(review): with frequency = 365 the second element of start is
# a day-of-year index, so c(2013, 7) marks day 7 of 2013 even though
# the data actually begin on 1 July 2013 (day 182) — confirm whether
# start = c(2013, 182) was intended.
my_ts_br <- ts(full_data_branch$x , start = c(2013,7), frequency = 365)
# Summary of data
summary(my_ts_br)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.0 86.0 103.0 104.6 122.0 181.0
# head of data
head(my_ts_br,20)
## Time Series:
## Start = c(2013, 7)
## End = c(2013, 26)
## Frequency = 365
## [1] 98 135 121 113 122 102 99 143 124 156 93 110 97 86 120 100 127
## [18] 146 144 110
# Dividing the dataset in Train and Test Data
# --------
# NOTE(review): the split points c(2017, 5) / c(2017, 6) use the same
# day-of-year units as start above, so the holdout boundary falls
# very early in the series' fifth year — verify this is the intended
# train/test split.
train <- window(
my_ts_br,
end=c(2017,5))
test <- window(
my_ts_br,
start=c(2017,6))
# Head of train data
head(train)
## Time Series:
## Start = c(2013, 7)
## End = c(2013, 12)
## Frequency = 365
## [1] 98 135 121 113 122 102
# Head of test data
head(test)
## Time Series:
## Start = c(2017, 6)
## End = c(2017, 11)
## Frequency = 365
## [1] 120 117 131 96 112 120
# STL-decompose the log-transformed series into seasonal, trend and
# remainder components; "per" (short for "periodic") fixes the
# seasonal pattern to be identical across years.
decomp <- stl(log(my_ts_br), "per")
summary(decomp)
## Call:
## stl(x = log(my_ts_br), s.window = "per")
##
## Time.series components:
## seasonal trend remainder
## Min. :-0.3898126 Min. :4.508096 Min. :-1.1864296
## 1st Qu.:-0.0876822 1st Qu.:4.559432 1st Qu.:-0.1258090
## Median : 0.0025861 Median :4.609144 Median : 0.0178137
## Mean : 0.0002303 Mean :4.616294 Mean :-0.0016538
## 3rd Qu.: 0.0943307 3rd Qu.:4.661327 3rd Qu.: 0.1476118
## Max. : 0.3099108 Max. :4.790346 Max. : 0.6414138
## IQR:
## STL.seasonal STL.trend STL.remainder data
## 0.1820 0.1019 0.2734 0.3497
## % 52.1 29.1 78.2 100.0
##
## Weights: all == 1
##
## Other components: List of 5
## $ win : Named num [1:3] 18321 549 365
## $ deg : Named int [1:3] 0 1 1
## $ jump : Named num [1:3] 1833 55 37
## $ inner: int 2
## $ outer: int 0
# Trend, seasonal and remainder panels of the STL decomposition.
ggplotly(autoplot(decomp))
# Interactive plot of the daily donor counts with range-selector
# buttons (3 mo / 6 mo / 1 yr / YTD / all) and a range slider.
# Fix: the original chunk was garbled by document conversion
# (smart quotes, "\(" / "\)" in place of "$"); reconstructed from
# its visible structure.
plot_ly(
  x = ~full_data_branch$Date,
  y = ~full_data_branch$x,
  mode = "lines"
) %>%
  layout(
    title = "Sales",
    xaxis = list(
      title = "Date",
      rangeselector = list(
        buttons = list(
          list(count = 3, label = "3 mo", step = "month", stepmode = "backward"),
          list(count = 6, label = "6 mo", step = "month", stepmode = "backward"),
          list(count = 1, label = "1 yr", step = "year", stepmode = "backward"),
          list(count = 1, label = "YTD", step = "year", stepmode = "todate"),
          list(step = "all")
        )
      ),
      rangeslider = list(type = "date")
    ),
    yaxis = list(title = "Count")
  )
# Seasonal plot: one line per year, to compare seasonal patterns
# across years.
ggplotly(ggseasonplot(my_ts_br, polar = FALSE))
# Accuracy metrics ----
# Fix: the original defined MAE and MAPE twice with byte-identical
# bodies; the duplicates are removed.

# Mean squared error of predictions yhat against actuals y.
MSE <- function(y, yhat) {
  mean((y - yhat)^2)
}

# Mean absolute error.
MAE <- function(y, yhat) {
  mean(abs(y - yhat))
}

# Mean absolute percentage error. Returns a percentage by default,
# or a raw proportion when percent = FALSE. Note: undefined when
# any y is zero (division by the actual value).
MAPE <- function(y, yhat, percent = TRUE) {
  if (percent) {
    100 * mean(abs((y - yhat) / y))
  } else {
    mean(abs((y - yhat) / y))
  }
}
# Accumulator for each model's test-set accuracy.
# Fix: the original initialized columns named "modelname" etc. with
# as.character() and then discarded the frame entirely by overwriting
# it at the first model; initialize with the column names and types
# actually used, and append with rbind.
models_accuracy <- data.frame(
  model.name = character(),
  mse = numeric(),
  mae = numeric(),
  mape = numeric()
)
# Model 1: STL decomposition + forecast (stlf) with robust fitting
# and an automatically chosen Box-Cox lambda.
stlf.fit <- stlf(train, h = length(test), robust = TRUE, lambda = "auto")
yHat <- stlf.fit$mean
mse <- MSE(test, yHat)
mae <- MAE(test, yHat)
mape <- MAPE(test, yHat)
models_accuracy <- rbind(
  models_accuracy,
  data.frame(model.name = "STLF", mse = mse, mae = mae, mape = mape)
)
ggplotly(autoplot(stlf.fit))
# Baseline model: automatic forecast() on the training series over
# the test horizon.
base.fit <- forecast(train, h = length(test))
yHat <- base.fit$mean
mse <- MSE(test, yHat)
mae <- MAE(test, yHat)
mape <- MAPE(test, yHat)
models_accuracy <- rbind(
  models_accuracy,
  data.frame(model.name = "simple forecas", mse = mse, mae = mae, mape = mape)
)
ggplotly(autoplot(base.fit))
# Neural-network autoregression (nnetar) with a Box-Cox transform
# (lambda = 0.9), forecast over the test horizon.
# Fix: removed the stray trailing comma in the nnetar() call, which
# passed an empty positional argument.
nn.fit <- forecast(nnetar(train, lambda = 0.9), h = length(test))
yHat <- nn.fit$mean
mse <- MSE(test, yHat)
mae <- MAE(test, yHat)
mape <- MAPE(test, yHat)
ggplotly(autoplot(nn.fit))
models_accuracy <- rbind(
  models_accuracy,
  data.frame(model.name = "Neural network", mse = mse, mae = mae, mape = mape)
)
# Accuracy of different models are
models_accuracy
## model.name mse mae mape
## 1 STLF 727.3467 21.52041 27.00349
## 2 simple forecas 799.8863 22.38884 29.00822
## 3 Neural network 542.6518 18.54749 20.78775